Have the required packages installed and create a data folder to store the data files required for compiling this file.
Note that Bioconductor is considered insatlled by default.
# For finding experiments of interest =========================================
if (!requireNamespace("GEOmetadb", quietly = TRUE))
BiocManager::install("GEOmetadb", ask = FALSE)
if (!requireNamespace("GEOquery", quietly = TRUE))
BiocManager::install("GEOquery", ask = FALSE)
# For HUGO symbol mapping =====================================================
# Use biomaRt database to map identifers
if (!requireNamespace("biomaRt", quietly = TRUE))
BiocManager::install("biomaRt", ask = FALSE)
# Use org.Hs.eg.db database to map identifiers
if (!requireNamespace("annotate", quietly = TRUE))
BiocManager::install("annotate", ask = FALSE)
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE))
BiocManager::install("org.Hs.eg.db", ask = FALSE)
library(org.Hs.eg.db) # Attach database
# For data processing =========================================================
if (!requireNamespace("edgeR", quietly = TRUE))
BiocManager::install("edgeR", ask = FALSE)
# Where we put all the data in ================================================
if(!dir.exists("./data")) {
dir.create("./data", showWarnings = FALSE)
}
# Download GEOmeta database if not found
if(!file.exists("./data/GEOmetadb.sqlite"))
GEOmetadb::getSQLiteFile(destdir = "./data")
# Establish connection to GEOmetadb.sqlite
con <- DBI::dbConnect(RSQLite::SQLite(), "./data/GEOmetadb.sqlite")
Building an SQL query to find a list of experiments of interests.
query <- paste(
"SELECT DISTINCT",
"gse.title,",
"gse.gse,",
"gpl.title AS platform,",
"gse.submission_date,",
"gse.supplementary_file",
"FROM",
"gse JOIN gse_gpl ON gse_gpl.gse=gse.gse",
"JOIN gpl ON gse_gpl.gpl=gpl.gpl",
"WHERE",
"gse.submission_date > '2015-01-01' AND", # not older than 6 years
"gse.title LIKE '%SARS-CoV-2%' AND ", # something about covid
"gpl.organism LIKE '%Homo sapiens%' AND", # a dataset human cells or tissue
"gpl.technology LIKE '%high-throughput seq%' ", # RNA-seq data
sep = " ")
Query GEOmeta database:
GEOresult <- DBI::dbGetQuery(conn = con, statement = query)
DBI::dbDisconnect(con) # close connection
rm(con)
# Look for experiments that provide RNA-seq count files
hasCounts_files <- GEOresult$supplementary_file[
grep(
GEOresult$supplementary_file,
pattern = "count",
ignore.case = TRUE
)
]
# Using Regex to extract GEO series of such experiments
hasCounts_gse <- unlist(
regmatches(hasCounts_files,
regexec("GSE[0-9]{4,}[^/]", hasCounts_files)
)
)
SELECT_ROWS <- GEOresult$gse %in% hasCounts_gse
candidate_dataset <- GEOresult[SELECT_ROWS, ]
candidate_dataset[ , 1:2]
Our dataset of interest is associated with experiment GSE152641. To download the data files in R
# Get dataset by GSE accession
series <- "GSE152641"
# Since there is only one supplementary file:
fname <- GEOquery::getGEOSuppFiles(GEO = series,
fetch_files = FALSE,
makeDirectory = FALSE)$fname
# Download dataset if it doesn't exist in the data directory
if (!file.exists(file.path(getwd(), "data", fname))) {
GEOquery::getGEOSuppFiles(GEO = series,
baseDir = "./data",
makeDirectory = FALSE)
}
Load data from the GEO supplementary file. Note that this is a csv file.
gene_counts <- read.csv(
file = file.path(getwd(), "data", fname),
header = TRUE,
check.names = FALSE
)
# Note that these are Entrez ID's
colnames(gene_counts)[1] <- "entrezgene_id"
Since the first column represents the Entrez ID of each gene, we need to map them back to their HUGO symbols for analysis.
Mapping Entrez ID’s to HUGO symbols:
# Choose the dataset for Homo sapiens
mart <- biomaRt::useMart("ensembl",
dataset = "hsapiens_gene_ensembl")
entrez_to_hugo_biomaRt <- biomaRt::getBM(
mart = mart,
attributes = c("entrezgene_id", "hgnc_symbol"), # Entrez ID to HUGO symbol
filters = "entrezgene_id", # filter by the Entrez ID's in the data
values = gene_counts$entrezgene_id
)
There are duplicates in the mapping, which map some entrez ID’s to rows of blank spaces. Remove rows of blank spaces before mapping:
# Remove rows that have no HUGO symbols mapped to Entrez ID's
entrez_to_hugo_biomaRt <- entrez_to_hugo_biomaRt[
entrez_to_hugo_biomaRt$hgnc_symbol != "", ]
# Check how many Entrez ID's remain unmapped
if (nrow(entrez_to_hugo_biomaRt) != length(
unique(gene_counts$entrezgene_id))) {
miss <- (length(unique(gene_counts$entrezgene_id)) - nrow(entrez_to_hugo_biomaRt))
sprintf("%i Entrez ID's missing HUGO symbols.", miss)
}
## [1] "277 Entrez ID's missing HUGO symbols."
This is because biomaRt first maps Entrez IDs to Ensembl IDs, then to HUGO symbols. However there does not exist a simple 1-to-1 mapping of Entrez IDs to Ensembl ID. Hence we are not getting a full translation of entrez ID. (Sudbery 2017)
We need to try an alternative database.
org.Hs.eg.db is a genome wide annotation for Human, primarily based on mapping using Entrez Gene identifiers. Therefore it might has a better mapping than biomaRt has.
entrez_to_hugo <- annotate::getSYMBOL(x = as.character(gene_counts$entrezgene_id),
data = 'org.Hs.eg')
if (length(entrez_to_hugo[!is.na(entrez_to_hugo)]) != length(unique(gene_counts$entrezgene_id))) {
miss <- (length(unique(gene_counts$entrezgene_id)) -
length(entrez_to_hugo[!is.na(entrez_to_hugo)])
)
sprintf("%i Entrez ID's missing gene names.", miss)
}
## [1] "3 Entrez ID's missing gene names."
This has a better coverage…for gene names.
# Convert the mapping from a named vector into a dataframe
# so that we can use merge() to map
entrez_to_hugo <- data.frame(
entrezgene_id = as.integer(names(entrez_to_hugo)),
gname = entrez_to_hugo
)
# Mapping by Entrez ID
gene_counts <- merge(gene_counts,
entrez_to_hugo,
by = "entrezgene_id"
)
# Reorder columns
gene_counts <- gene_counts[, c(1, 88, 2:87)]
# Find Entrez ID's that were not mappped
gene_counts[is.na(gene_counts$gname), ]$entrezgene_id
## [1] 10638 285464 388289
We can check the remaining 3 unmapped genes manually:
| Entrez ID | Gene Symbol | Status |
|---|---|---|
| 10638 | SPHAR (HUGO) | Withdrawn by NCBI. Defined by BC107681.1, which review shows corresponds to the 3’ UTR of GeneID:5867 and not an independent gene. |
| 285464 | CRIPAK (HUGO) | Withdrawn by NCBI, after discussions with CCDS collaborators. It was decided that this locus is not an independent gene. |
| 388289 | C16orf47 | Replaced with Gene ID: 463, HUGO symbol: ZFHX3. |
Above information was obtained from NCBI database service. Although SPHAR corresponds to an UTR of another gene, I will retain it for now since we are basing the analysis on the Entrez ID’s. I also decided to retain CRIPAK and OCLM for the same reason, despite the fact that they are no longer deemed to be an independent genes.
We manualy label these genes:
gene_counts[gene_counts$entrezgene_id == 10638, "gname"] <- "SPHAR"
gene_counts[gene_counts$entrezgene_id == 285464, "gname"] <- "CRIPAK"
gene_counts[gene_counts$entrezgene_id == 388289, "gname"] <- "ZFHX3"
The experiment was performed for two groups: healthy controls and patients with COVID-19:
samples_by_group <- grepl(pattern = "^ESC", colnames(gene_counts[3:88]))
samples_by_group <- data.frame(samples_by_group)
rownames(samples_by_group) <- colnames(gene_counts[3:88])
colnames(samples_by_group) <- "group"
samples_by_group$group <- ifelse(samples_by_group$group, "COVID-19", "HC")
samples_by_group <- data.frame(
sample = rownames(samples_by_group),
group = samples_by_group$group
)
gene_freq <- table(gene_counts$gname)
gene_freq[which(gene_freq > 1)]
## ZFHX3
## 2
Using edgeR to filter genes with low counts:
cpms <- edgeR::cpm(gene_counts[,3:88])
rownames(cpms) <- gene_counts$entrezgene_id
In edgeR, it is recommended to remove features without at least 1 read per million in n of the samples, where n is the size of the smallest group of replicates. (Anders et al. 2013) For this data set, we have only two groups: 24 healthy human controls, 62 COVID-19 patients. Hence 24 is the size is the smallest group of biological replicates here.
keep <- (rowSums(cpms > 1) >= 24)
gene_counts_filtered <- gene_counts[keep, ]
summarised_gene_counts_filtered <- sort(table(gene_counts_filtered$gname),
decreasing = TRUE)
summarised_gene_counts_filtered[which(summarised_gene_counts_filtered > 1)]
## ZFHX3
## 2
We have removed 6034 genes from the data set, but the duplicate gene persist. Since this is the only one left, we can check it manually.
duplicate_genes <- data.frame(
lapply(
names(
summarised_gene_counts_filtered[
which(summarised_gene_counts_filtered > 1)
]),
function(x){
gene_counts[ gene_counts$gname == x , ]$entrezgene_id
}
)
)
colnames(duplicate_genes) <- names(
summarised_gene_counts_filtered[
which(summarised_gene_counts_filtered > 1)
]
)
duplicate_genes
If we look up this gene on NCBI Gene database:
| Entrez ID | Gene Symbol | Status |
|---|---|---|
| 388289 | C16orf47 | Replaced with Gene ID: 463, HUGO symbol: ZFHX3 |
We can see these 2 different Entrez ID’s are referred to by the same HUGO symbol.
Let’s compare the counts of these duplicate genes to see if the duplicate record should be removed:
for (i in seq_along(duplicate_genes)) {
counts <- gene_counts_filtered[
gene_counts_filtered$gname == names(duplicate_genes[i]),
3:88]
# compare expression levels of duplicates
if (all(counts[1, ] == counts[2, ])) {
print(paste(names(duplicate_genes[i]),
"has 2 the same records of RNA-seq counts for all sample.")
)
} else {
diff_samples <- colnames(gene_counts)[which(counts[1,] != counts[2,])]
print(paste(names(duplicate_genes[i]),
"has different counts for",
length(diff_samples), "samples.")
)
}
}
## [1] "ZFHX3 has 2 the same records of RNA-seq counts for all sample."
This means the duplicate is safe to remove, as they are the same gene having the same expression level. We remove the duplicate with the obsolete Entrez ID:
counts_to_remove <- as.integer(duplicate_genes[2, ])
gene_counts_filtered <- gene_counts_filtered[!(
gene_counts_filtered$entrezgene_id %in% counts_to_remove
), ]
The median of the medians of all distributions is labeled as a green dash line.
# Calculate log2 fold change
log2_cpms <- log2(edgeR::cpm(gene_counts_filtered[3:88]))
boxplot(log2_cpms,
xlab = "Samples",
ylab = "log2 CPM",
las = 2,
cex = 0.5,
cex.lab = 0.5,
cex.axis = 0.25,
main = "RNA-seq Samples of COVID-19 Patients and Heathy Controls")
#draw the median on each box plot
abline(h = median(apply(log2_cpms, 2, median)),
col = "green",
lwd = 0.75,
lty = "dashed")
Estimate the density for the distribution for gene expression for each sample:
# Calculate Gaussian kernel density estimates of log2 fold change cpm
log2_cpms_density <- apply(log2(edgeR::cpm(gene_counts_filtered[, 3:88])),
2,
density)
# calculate the limits across all the samples
xlim <- 0
ylim <- 0
for (i in seq_along(log2_cpms_density)) {
xlim <- range(c(xlim, log2_cpms_density[[i]]$x))
ylim <- range(c(ylim, log2_cpms_density[[i]]$y))
}
# 86 line colours for all 86 samples
cols <- rainbow(length(log2_cpms_density))
# choose line type 1 for density graphs of all 86 samples
ltys <- rep(1, length(log2_cpms_density))
# Initialize the density plot without density lines
plot(log2_cpms_density[[1]],
xlim = xlim,
ylim = ylim,
type = "n",
ylab = "Smoothing density of log2-CPM",
main = "Distributions for log2 CPM's of each sample",
cex.lab = 0.75
)
# plot each line
for (i in seq_along(log2_cpms_density)) {
lines(log2_cpms_density[[i]],
col = cols[i])
}
# create legend
legend("topright",
colnames(log2_cpms),
col = cols,
lty = ltys,
cex = 0.15,
border = "blue",
text.col = "green4",
merge = TRUE,
bg = "gray90")
Create an edgeR data object to contain RNASeq count data, since edgeR functions primarily work with DGEList.
gene_counts_filtered_matrix <- as.matrix(gene_counts_filtered[3:88])
rownames(gene_counts_filtered_matrix) <- gene_counts_filtered$entrezgene_id
# matrix to DGEList to normalise data for edgeR
gene_counts_filtered_toNormalise <- edgeR::DGEList(
counts = gene_counts_filtered_matrix,
group = samples_by_group$group)
Compute the normalization factor with edgeR:
gene_counts_filtered_toNormalise <- edgeR::calcNormFactors(gene_counts_filtered_toNormalise)
Get the normalised counts:
gene_counts_filtered_normalised <- edgeR::cpm(gene_counts_filtered_toNormalise)
log2_cpms_normalised <- log2(gene_counts_filtered_normalised)
def_par <- par(no.readonly = TRUE)
par(mfrow = c(1, 2))
boxplot(log2_cpms,
xlab = "Samples",
ylab = "log2 CPM",
las = 2,
cex = 0.5,
cex.lab = 0.75,
cex.axis = 0.25,
main = "RNA-seq Samples of COVID-19 Patients and Heathy Controls",
cex.main = 0.65)
#draw the median on each box plot
abline(h = median(apply(log2_cpms, 2, median)),
col = "green",
lwd = 0.75,
lty = "dashed")
boxplot(log2_cpms_normalised,
xlab = "Samples",
ylab = "log2 CPM",
las = 2,
cex = 0.25,
cex.lab = 0.75,
cex.axis = 0.25,
main = "Distributions for TMM Normalsed Counts of RNA-seq Samples",
cex.main = 0.6)
# draw the median on each box plot
abline(h = median(
apply(
log2_cpms_normalised,
MARGIN = 2,
FUN = median)
),
col = "green",
lwd = 0.75,
lty = "dashed")
We observe that the medians of distributions for normalised RNA-seq counts are much closer to each other. The interquartile ranges for the gene expression levels of each sample also become more consistent after normalisation applied. This means 50% of gene expression levels have similar distributions for each sample.
par(mfrow = c(1, 2))
# calculate the limits across all the samples
xlim <- 0
ylim <- 0
for (i in seq_along(log2_cpms_density)) {
xlim <- range(c(xlim, log2_cpms_density[[i]]$x))
ylim <- range(c(ylim, log2_cpms_density[[i]]$y))
}
# 86 line colours for all 86 samples
cols <- rainbow(length(log2_cpms_density))
# choose line type 1 for density graphs of all 86 samples
ltys <- rep(1, length(log2_cpms_density))
# Initialize the density plot without density lines
plot(log2_cpms_density[[1]],
xlim = xlim,
ylim = ylim,
type = "n",
ylab = "Smoothing density of log2-CPM",
main = "Distribution for Gene Expression Levels of Each Sample",
cex.main = 0.7,
cex.lab = 0.75
)
# plot smoothing density for each sample
for (i in seq_along(log2_cpms_density)) {
lines(log2_cpms_density[[i]],
col = cols[i])
}
# create legend for samples
legend("topright",
colnames(log2_cpms),
col = cols,
lty = ltys,
cex = 0.15,
border = "blue",
text.col = "green4",
merge = TRUE,
bg = "gray90")
# Get the normalised fold change cpm
log2_cpms_normalised_density <- apply(log2_cpms_normalised,
MARGIN = 2,
FUN = density)
# Initialize the density plot without density lines
plot(log2_cpms_normalised_density[[1]],
xlim = xlim,
ylim = ylim,
type = "n",
ylab = "Smoothing density of log2-CPM",
main = "Distributions for Normalised Expression Levels of Each Sample",
cex.main = 0.65,
cex.lab = 0.75,
)
# plot smoothing density for each sample
for (i in seq_along(log2_cpms_normalised_density)) {
lines(log2_cpms_normalised_density[[i]],
col = cols[i])
}
# create legend for samples
legend("topright",
colnames(log2_cpms_normalised),
col = cols,
lty = ltys,
cex = 0.15,
border = "blue",
text.col = "green4",
merge = TRUE,
bg = "gray90")
The variations between the distributions of all samples for the normalised gene expression values become less, as the spread becomes narrower.
Plot an MDS plot using limma package
limma::plotMDS(gene_counts_filtered_normalised,
labels = samples_by_group$sample,
col = c("darkgreen","blue")[factor(samples_by_group$group)],
cex.axis = 0.75,
cex.lab = 0.75,
main="Dissimilarity between Samples in Expression Level (Fold Change)")
The distances between samples on the MDS plot suggest how different these samples are in terms of log fold changes of the expression levels of our 14421 genes. We can see the patient group and the healthy control group are seperated into two clusters, although the distance between the clusters are not far apart and there are a few samples of the two different groups mixed up.
Visualised with the BCV plot. Since we attempt to capture the variability in gene expressions between group, we use group as target to build a model matrix:
gene_counts_filtered_normalised_DGEList <- edgeR::DGEList(
counts = gene_counts_filtered_normalised,
group = samples_by_group$group)
# Regression model with group as target as samples as predictors
model_design <- model.matrix(~group, data = samples_by_group)
com_dispersion <- edgeR::estimateDisp(gene_counts_filtered_normalised_DGEList,
design = model_design)
edgeR::plotBCV(com_dispersion,
col.tagwise = "black",
col.common = "red",
main = "Common and Tag-wise Dispersion vs Expression Levels")
Since the dots represent tag-wise dispersions that are gene-specific, the fact that we observe dots having 0 BCV at the bottom suggests that there exist genes not showing any evidence of differences between our biological replicates. An increasing trend of BCV is present for increasing expression level; this is unexpected since genes with more counts should have smaller variations between samples than genes with fewer counts. The common dispersion falls in the range between 0.25 to 0.3 in the plot, meaning variability across all genes for every sample is between these values.
edgeR::plotMeanVar(com_dispersion,
show.raw.vars = TRUE,
show.tagwise.vars = TRUE,
NBline = FALSE,
show.ave.raw.vars = FALSE,
show.binned.common.disp.vars = TRUE,
main = paste("Mean-variance Relationship for 14421 Genes of",
"COVID-19 Group and Healthy Control Group"),
cex.main = 0.8
)
We observe a positive linear correlation between the mean gene expression levels for 86 samples and the pooled gene-level variance on a log10 scale.
Note that the identifers returned by org.Hs.eg.db are NOT all in HUGO format:
gene_counts_filtered[grep(pattern = "^LOC", gene_counts_filtered$gname), 2][1:5]
## [1] "LOC283710" "LOC284930" "LOC388813" "LOC399900" "LOC400499"
But at least we have found gene names for every gene we have.
The normalised data is stored as a matrix. We make it into a data frame first, and then split it into 2 data frames: one where every gene has a unique HUGO symbol, and the other with genes missing HUBO symbols.
For the mapping, we use the query result from biomaRt:
gene_counts_filtered_normalised <- as.data.frame(gene_counts_filtered_normalised)
gene_counts_filtered_normalised <- data.frame(
entrezgene_id = as.integer(rownames(gene_counts_filtered_normalised)),
gene_counts_filtered_normalised
)
Map identifiers using the biomaRt mapping:
finalized_normalized_counts <- merge(
gene_counts_filtered_normalised,
entrez_to_hugo_biomaRt,
by = "entrezgene_id"
)[ , c(1, 88, 2:87)] # re-order column order
Seperate out genes that cannot be mapped to the current HUGO symbols:
missing_ids_subset <- setdiff(
gene_counts_filtered_normalised$entrezgene_id,
finalized_normalized_counts$entrezgene_id)
missing_ids_subset <- gene_counts_filtered_normalised[
which(gene_counts_filtered_normalised$entrezgene_id %in% missing_ids_subset),
]
# get mapping for all genes from pre-normalised data frame
entrez_to_gname <- gene_counts_filtered[ , 1:2]
missing_ids_subset_withids <- merge(
missing_ids_subset,
entrez_to_gname,
by = "entrezgene_id"
)[ , c(1, 88, 2:87)] # re-order column order
But in fact, there are HUGO symbols that biomaRt failed to find, while they can be obtained by org.Hs.eg.db…
missing_ids_subset_withids
Take them out one-by-one…by hand…
isHUGO <- c("FCGR1B", "SLC22A18AS", "FCGR2C", "SSPO", "TUG1",
"FAM86C1", "PRRC2B", "BAGE2", "PGBD3", "ALG1L9P", "CRIPAK",
"KIAA1107", "SLED1", "KIAA0754", "ANKRD20A4", "GOLGA6L19",
"THRA1/BTR", "ZNF8-ERVK3-1", "SREBF2-AS1")
Note that the HUGO symbol for THRA1/BTR is actually just THRA1. We will change this by hand later.
# Genes that can be mapped to current HUGO symbols but in the wrong data frame
hasHUGO_genes <- missing_ids_subset_withids[
missing_ids_subset_withids$gname %in% isHUGO,
]
# Take them out from the wrong dataframe for non HUGO genes
missing_ids_subset_withids <- missing_ids_subset_withids[
-which(missing_ids_subset_withids$gname %in% isHUGO),]
# Add these genes to HUGO dataframe
colnames(hasHUGO_genes)[2] <- colnames(finalized_normalized_counts)[2]
finalized_normalized_counts <- rbind(finalized_normalized_counts,
hasHUGO_genes)
# Check if the numbers of genes in the two data frames
# add up to the total genes after removing outliers
nrow(finalized_normalized_counts) +
nrow(missing_ids_subset_withids) ==
nrow(gene_counts_filtered_normalised)
## [1] FALSE
The numbers of genes don’t add up. We need to find out what happened.
gene_freq_finalised <- table(finalized_normalized_counts$hgnc_symbol)
gene_freq_finalised[which(gene_freq_finalised > 1)]
##
## APOBEC3A ARL17B ASB3 BCAR3 CCL3L1 CCL3L3 CCL4L2 CEP170
## 2 3 2 2 2 2 2 2
## CHML CHTF8 CYP2D6 DAB2 DAXX DLEU7 EDEM2 FNTB
## 2 2 3 2 2 2 2 2
## GNMT GOLGA8Q GPRASP2 GTF2H2 HLA-DQA2 KLHL23 KLRK1 LILRB3
## 2 2 2 2 2 2 2 4
## LINC00511 MICAL2 MRPL23 NHSL2 NOMO2 OMG PAK6 PDE10A
## 2 2 2 2 2 2 2 2
## PDXK PPP1R18 S1PR3 SAMSN1 SCNM1 SMN2 SPDYE16 TAS2R14
## 2 2 2 2 2 2 2 2
## TAS2R43 TBC1D3L TBC1D7 TMOD3 TRABD2A USP4 USP9Y WNT3
## 2 2 2 2 2 2 2 2
## ZNF117 ZNF197 ZNF761
## 2 2 2
This is unexpected. I believe this has to do with the mapping by biomaRt, since we have cleaned up the duplicate already.
finalized_normalized_counts[finalized_normalized_counts$hgnc_symbol == "APOBEC3A", 1]
## [1] 200315 100913187
If we look up these 2 Entrez ID’s on NCBI Gene, we will find that 200315 maps to APOBEC3A, while 100913187 maps to APOBEC3A_B.
entrez_to_hugo_biomaRt[entrez_to_hugo_biomaRt$entrezgene_id == 100913187, 2]
## [1] "APOBEC3A"
biomaRt incorrectly maps 100913187 to APOBEC3A.
entrez_to_hugo[
entrez_to_hugo$entrezgene_id == 100913187, 2]
## [1] "APOBEC3A_B"
The mapping by org.Hs.eg.db however does map this Entrez ID correctly, although the mapping also inclues symbols that are not HUGO.
This means we can “filter” out genes that have no HUGO symbols using the biomaRt mapping by Entrez ID’s, and map the Entrez ID’s using the mapping by org.Hs.eg.db.
# Re-create dataframes
finalized_normalized_counts <- NA
missing_ids_subset_withids <- NA
# genes that can be mapped to HUGO symbols by biomaRt
hasHUGO_id <- intersect(gene_counts_filtered_normalised$entrezgene_id,
entrez_to_hugo_biomaRt$entrezgene_id)
# Data frame to store genes that can be mapped to current HUGO symbols
finalized_normalized_counts <- gene_counts_filtered_normalised[
which(gene_counts_filtered_normalised$entrezgene_id %in% hasHUGO_id),]
# mapping identifier using the mapping from pre-normalised dataframe
finalized_normalized_counts <- merge(
finalized_normalized_counts,
entrez_to_gname,
by = "entrezgene_id"
)[, c(1, 88, 2:87)]
# gname -> hugo__symbol
colnames(finalized_normalized_counts)[2] <- colnames(entrez_to_hugo_biomaRt)[2]
# Entrez ID's that cannot be mapped to current HUGO symbols by bioimaRt
missing_ids_subset <- setdiff(
gene_counts_filtered_normalised$entrezgene_id,
hasHUGO_id
)
# Genes with Entrez ID's unmappped
missing_ids_subset_withids <- gene_counts_filtered_normalised[
which(
gene_counts_filtered_normalised$entrezgene_id %in% missing_ids_subset)
, ]
# map by the old mapping
missing_ids_subset_withids <- merge(
missing_ids_subset_withids,
entrez_to_gname
)[, c(1, 88, 2:87)]
Check whether there are genes with HUGO symbols in missing_ids_subset_withids:
missing_ids_subset_withids
The genes with HUGO symbols here are the same as those we previously took out.
# Genes that have HUGO symbols but in the wrong data frame
hasHUGO_genes <- missing_ids_subset_withids[
missing_ids_subset_withids$gname %in% isHUGO,
]
# Remove HUGO genes from dataframe for genes with no HUGO symbols
missing_ids_subset_withids <- missing_ids_subset_withids[
-which(missing_ids_subset_withids$gname %in% isHUGO),]
# Add these genes to the HUGO dataframe
colnames(hasHUGO_genes)[2] <- colnames(finalized_normalized_counts)[2]
finalized_normalized_counts <- rbind(finalized_normalized_counts,
hasHUGO_genes)
# Check if the numbers add up
nrow(finalized_normalized_counts) +
nrow(missing_ids_subset_withids) ==
nrow(gene_counts_filtered_normalised)
## [1] TRUE
Now the numbers add up. We change the HUGO symbol THRA1/BTR to THRA1 to match the official HUGO record.
finalized_normalized_counts[
finalized_normalized_counts$hgnc_symbol == "THRA1/BTR",
2
] <- "THRA1"
# Check duplicate genes in finalised HUGO data frame
gene_freq_finalised <- table(finalized_normalized_counts$hgnc_symbol)
hasDuplicate <- length(which(gene_freq_finalised > 1)) > 0
if (hasDuplicate) {
print("Duplicate HUGO symbols present in the finalised data frame.")
} else {
print("HUGO symbols are unique in the finalised data frame.")
}
## [1] "HUGO symbols are unique in the finalised data frame."
# Check duplicate genes in dataframe without HUGO
gene_freq_noHUGO <- table(missing_ids_subset_withids$gname)
hasDuplicate <- length(which(gene_freq_noHUGO > 1)) > 0
if (hasDuplicate) {
print("Duplicate gene names present in the finalised data frame.")
} else {
print("Gene names are unique in the finalised data frame.")
}
## [1] "Gene names are unique in the finalised data frame."
# Confirm every gene is mapped in HUGO data frame
hasUnmapped <- length(finalized_normalized_counts[
is.na(finalized_normalized_counts$hgnc_symbol) |
finalized_normalized_counts$hgnc_symbol == "", 1]
) > 0
if (hasUnmapped) {
print("Unmapped Entrez ID present in the finalised data frame.")
} else {
print("Every Entrez ID is mapped in the finalised data frame.")
}
## [1] "Every Entrez ID is mapped in the finalised data frame."
# Confirm every gene is mappped in non-HUGO name data frame
hasUnmapped <- length(missing_ids_subset_withids[
is.na(missing_ids_subset_withids$gname) |
missing_ids_subset_withids$gname == "", 1]
) > 0
if (hasUnmapped) {
print("Unmapped gene name present.")
} else {
print("Every gene name is mapped.")
}
## [1] "Every gene name is mapped."
We have successfully seperated genes that have unique HUGO symbols and genes that do not. Make HUGO symbol symbols as row names and keep only the biological replicates as columns:
# HUGO symbols to row names
rownames(finalized_normalized_counts) <- finalized_normalized_counts$hgnc_symbol
# Remove columns that are not samples
finalized_normalized_counts <- finalized_normalized_counts[ , 3:88]
# gene names as row names
rownames(missing_ids_subset_withids) <- missing_ids_subset_withids$gname
# Remove columns that are not samples
missing_ids_subset_withids <- missing_ids_subset_withids[ , 3:88]
The finalised data frames are printed after the data intrepretation section.
What are the control and test conditions of the dataset?
The control condition is healthy humans, and the test condition is patients with SARS-Cov-2 infection within the first 24 hours of hospital admission.
Why is the dataset of interest to you?
The reason is that this dataset directly compares the expression of genes whose expressions in patients infected by the other 6 viruses (influenza, RSV, HRV, Ebola, Dengue, and SARS1) have been previously studied in healthy humans and COVID-19 patients. It attempts to capture the genes that express in COVID-19 patients differently from healthy individuals.
Were there expression values that were not unique for specific genes? How did you handle these?
After trimming and normalising the RNA-seq counts, there are still 51 expression values not unique to genes. However, since their expression values are not unique and we have no information about the genes they are specific to, we have no “keys” to locate them in the data frame, and we wouldn’t know which gene has the same expression level as which. Therefore I chose not to handle these records.
Were there expression values that could not be mapped to current HUGO symbols?
Yes, and the genes these values specific to were stored in the missing_ids_subset_withids data frame, where their gene names were specified. Particularly, there were 3 genes with Entrez ID 10638, 285464 and 388289 could not be mapped to the current HUGO symbol, but then I mapped them manually. The reason why they could not be mapped is that, for the first 2 genes, they were withdrawn by NCBI and the last replaced with an updated Entrez ID.
How many outliers were removed?
The outliers were removed according to the edgeR protocol. (Anders et al. 2013) This means if a gene has fewer than 10 expression values that are greater than 1 count per million, then it will be considered as an outlier and removed. There are 6035 such outliers removed.
How did you handle replicates?
There was only one gene replicate ZFHX3. The reason why it has replicates is that its current Entrez ID and obsolete Entrez ID were both present in the dataset. Since the expression values for these 2 ID’s are also identical, it means that this gene was measured twice and hence I removed the record with the obsolete Entrez ID.
What is the final coverage of your dataset?
The final coverage is 70.5 percent after removing the outliers. If we exclude genes that cannot be mapped to unique HUGO symbols, the coverage will still be 70.1 percent.